{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a383edd1",
   "metadata": {},
   "outputs": [],
   "source": [
    "#问题一附件2上网业务数据集处理代码\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import warnings\n",
    "from plotnine import *\n",
    "from joypy import joyplot\n",
    "from sklearn.feature_selection import SelectKBest\n",
    "from minepy import MINE\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d5f7a97d",
   "metadata": {},
   "outputs": [],
   "source": [
    "#运行配置参数中的字体\n",
    "plt.rcParams['font.sans-serif'] = ['STSong']\n",
    "#运行配置参数总的轴(axes)正常显示正负号(minus)\n",
    "plt.rcParams['axes.unicode_minus'] = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "165ea4d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "#读取附件2数据\n",
    "data_2 = pd.read_excel('./data/附件2上网业务用户满意度数据.xlsx')\n",
    "data_2\n",
    "#计算每一列缺失值的占比\n",
    "def missing (df):\n",
    "    missing_number = df.isnull().sum().sort_values(ascending=False)\n",
    "    missing_percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)\n",
    "    missing_values = pd.concat([missing_number,missing_percent],axis=1,keys=['缺失值数量','缺失值所占百分比'])\n",
    "    return missing_values\n",
    " \n",
    "missing(data_2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d38fe16",
   "metadata": {},
   "outputs": [],
   "source": [
    "#缺失值处理\n",
    "#“ID”，“场景备注数据”，“现象备注数据”，“APP大类备注”，“APP小类视频备注”，“APP小类游戏备注”，“APP小类上网备注”中，“ID”是没有意义的数据，其他全都是缺失数据过多无法进行填充，故选择删除！\n",
    "data_2 = data_2.drop(['用户', '场景备注数据', '现象备注数据', 'APP大类备注', 'APP小类视频备注', 'APP小类游戏备注', 'APP小类上网备注'], axis = 1)\n",
    "#“上网质差次数”和“脱网次数”缺失值中，空白的为0\n",
    "data_2['上网质差次数'].fillna(0, inplace = True)\n",
    "data_2['脱网次数'].fillna(0, inplace = True)\n",
    "#'重定向次数','2G驻留时长', '微信质差次数', '王者荣耀质差次数','游戏类APP使用流量','今日头条使用流量', '快手使用流量', '优酷视频使用流量','腾讯视频使用流量'中，空白的为0\n",
    "data_2['重定向次数'].fillna(0, inplace = True)\n",
    "data_2['2G驻留时长'].fillna(0, inplace = True)\n",
    "data_2['微信质差次数'].fillna(0, inplace = True)\n",
    "data_2['王者荣耀质差次数'].fillna(0, inplace = True)\n",
    "data_2['游戏类APP使用流量'].fillna(0, inplace = True)\n",
    "data_2['今日头条使用流量'].fillna(0, inplace = True)\n",
    "data_2['快手使用流量'].fillna(0, inplace = True)\n",
    "data_2['优酷视频使用流量'].fillna(0, inplace = True)\n",
    "data_2['腾讯视频使用流量'].fillna(0, inplace = True)\n",
    " \n",
    "data_2['王者荣耀使用次数'].value_counts()\n",
    "data_2['王者荣耀使用次数'].fillna(0.0, inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "302cbee7",
   "metadata": {},
   "outputs": [],
   "source": [
    "#拉格朗日插值法\n",
    "#游戏类APP使用天数\n",
    "data_2['游戏类APP使用天数'].value_counts()\n",
    "plt.rcParams['font.sans-serif'] = ['STSong']\n",
    "plt.rcParams['figure.dpi'] = 1000\n",
    "sns.distplot(data_2['游戏类APP使用天数'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7ab4b1e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy.interpolate import lagrange\n",
    "def lag_fill(df,i,k):\n",
    "    r=0 if (i-k)<0 else (i-k)\n",
    "    l=len(df.index) if (i+1+k)>len(df.index) else (i+1+k)\n",
    "    y=df.loc[list(range(r,i))+list(range(i+1,l))]\n",
    "    for j in y.index:\n",
    "        if y.isnull().loc[j]:\n",
    "            y.drop(index=j,inplace=True)\n",
    "    x=y.index\n",
    "    lag=lagrange(x.values,y.values)\n",
    "    return lag(i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ab2db1ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "index = np.array(data_2['游戏类APP使用天数'][data_2['游戏类APP使用天数'].isnull()].index)\n",
    "nums = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "84e92cd0",
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in index:\n",
    "    num = int(lag_fill(data_2['游戏类APP使用天数'], i, 1))\n",
    "    nums.append(num)\n",
    "    \n",
    "a = data_2['游戏类APP使用天数'].copy()\n",
    "index = np.array(a[a.isnull()].index) # 缺失值的索引\n",
    "for i in range(len(index)):\n",
    "    a.loc[index[i]] = nums[i]\n",
    "a.isnull().sum()\n",
    "\n",
    "data_2['游戏类APP使用天数'] = a\n",
    "data_2['游戏类APP使用天数'].isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "edfbee75",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f5ad83f",
   "metadata": {},
   "outputs": [],
   "source": [
    "#“游戏类APP使用次数”的缺失值处理方式\n",
    "data_2['游戏类APP使用次数'].value_counts()\n",
    "plt.rcParams['font.sans-serif'] = ['STSong']\n",
    "plt.rcParams['figure.dpi'] = 1000\n",
    "sns.distplot(data_2['游戏类APP使用次数'])\n",
    "index = np.array(data_2['游戏类APP使用次数'][data_2['游戏类APP使用次数'].isnull()].index)\n",
    "nums = []\n",
    "for i in index:\n",
    "    num = int(lag_fill(data_2['游戏类APP使用次数'], i, 1))\n",
    "    nums.append(num)\n",
    "a = data_2['游戏类APP使用次数'].copy()\n",
    "index2 = np.array(a[a.isnull()].index) # 缺失值的索引\n",
    "for i in range(len(index2)):\n",
    "    a.loc[index2[i]] = nums[i]\n",
    "data_2['游戏类APP使用次数'] = a\n",
    "data_2['游戏类APP使用次数'].isnull().sum()\n",
    "data_2['游戏类APP使用次数'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f37fdfb",
   "metadata": {},
   "outputs": [],
   "source": [
    "#“王者荣耀使用天数”的缺失值处理\n",
    "data_2['王者荣耀使用天数'].value_counts()\n",
    "plt.rcParams['font.sans-serif'] = ['STSong']\n",
    "plt.rcParams['figure.dpi'] = 1000\n",
    "sns.distplot(data_2['王者荣耀使用天数'])\n",
    "index = np.array(data_2['王者荣耀使用天数'][data_2['王者荣耀使用天数'].isnull()].index)\n",
    "nums = []\n",
    "for i in index:\n",
    "    num = int(lag_fill(data_2['王者荣耀使用天数'], i, 1))\n",
    "    nums.append(num)\n",
    "a = data_2['王者荣耀使用天数'].copy()\n",
    "index2 = np.array(a[a.isnull()].index) # 缺失值的索引\n",
    "for i in range(len(index2)):\n",
    "    a.loc[index2[i]] = nums[i]\n",
    "data_2['王者荣耀使用天数'] = a\n",
    "data_2['王者荣耀使用天数'].isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f3a7d089",
   "metadata": {},
   "outputs": [],
   "source": [
    "#“终端类型”的缺失值的处理\n",
    "data_2['终端类型'].value_counts()\n",
    "data_2['终端类型'].fillna('手机', inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f7d591dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "#“操作系统”的缺失值处理\n",
    "data_2['操作系统'].value_counts()\n",
    "idxs_1 = ['Android 10', 'Android 9', 'Android 8.1.0', 'Android10', 'Android 8.0.0', 'Android 7.0', 'Android 11', 'Android 7.1.1', \\\n",
    "    'Android 6.0', 'Android 6.0.1', 'Android 7.1.2', 'Android9', 'Android 5.1', 'Android 9.0', 'Android8.1.0', 'android 5.0.2', \\\n",
    "        'Android 4.4.4', 'Android 5.1.1']\n",
    "for idx in idxs_1:\n",
    "    data_2['操作系统'].replace(idx, 'Android', inplace = True)\n",
    "\n",
    "idxs_2 = ['iOS 12.0', 'iOS 14.0', 'iOS 14.1', 'iOS 11.0', 'iOS 10.0', 'iOS 11.4.1', 'iOS 10.1', 'iOS 12', 'IOS', 'iOS13.3.1', \\\n",
    "    'iOS 9.0', 'iOS 8.0', 'IOS 9', 'iOS 9.3']\n",
    "for idx in idxs_2:\n",
    "    data_2['操作系统'].replace(idx, 'iOS', inplace = True)\n",
    "\n",
    "data_2['操作系统'].value_counts()\n",
    "data_2['操作系统'].fillna('Android', inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8493257c",
   "metadata": {},
   "outputs": [],
   "source": [
    "#“终端品牌”的缺失值处理\n",
    "data_2['终端品牌'].value_counts()\n",
    "data_2['终端品牌'].fillna('苹果', inplace = True)\n",
    " \n",
    "#“终端品牌类型”无用，选择删除\n",
    "data_2 = data_2.drop('终端品牌类型', axis = 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "383f89a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "#“当月高频通信分公司”的缺失值处理\n",
    "data_2['当月高频通信分公司'].value_counts()\n",
    "data_2['当月高频通信分公司'].fillna('城区二分公司', inplace = True)\n",
    " \n",
    "#“近3个月平均消费（剔除通信账户支付）”的缺失值处理\n",
    "data_2['近3个月平均消费（剔除通信账户支付）'].value_counts()\n",
    "data_2['近3个月平均消费（剔除通信账户支付）'].fillna(21.00, inplace = True)\n",
    " \n",
    "#“码号资源-激活时间”和“码号资源-发卡时间”的缺失值均接近60%，且意义不大，选择删除\n",
    "data_2 = data_2.drop(['码号资源-发卡时间', '码号资源-激活时间'], axis = 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "abb0e3db",
   "metadata": {},
   "outputs": [],
   "source": [
    "#检验数据集中是否还有空值\n",
    "msn.matrix(data_2, labels = False, label_rotation = 90)\n",
    "msn.bar(data_2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4232c9bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "#数据标签转化\n",
    "#数据的转化，居民小区，办公室，高校，商业街，地铁，农村，高铁：-1表示不在，1，2，3，4，5，6，7表示在\n",
    "cols = ['居民小区', '办公室', '高校', '商业街', '地铁', '农村', '高铁']\n",
    "j = 0\n",
    "for col in cols:\n",
    "        j += 1\n",
    "        data_2[col].replace(-1, '不在', inplace = True)\n",
    "        data_2[col].replace(j, '在', inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "35fb0cea",
   "metadata": {},
   "outputs": [],
   "source": [
    "#['其他，请注明', '其他，请注明.1', '其他，请注明.2', '其他，请注明.3', '其他，请注明.4', '其他，请注明.5']中，-1表示没有，98表示有\n",
    "data_2['其他，请注明.1'].replace(98, '有', inplace = True)\n",
    "data_2['其他，请注明.1'].replace(-1, '没有', inplace = True)\n",
    "data_2['其他，请注明'].replace(98, '有', inplace = True)\n",
    "data_2['其他，请注明'].replace(-1, '没有', inplace = True)\n",
    "data_2['其他，请注明.2'].replace(98, '有', inplace = True)\n",
    "data_2['其他，请注明.2'].replace(-1, '没有', inplace = True)\n",
    "data_2['其他，请注明.3'].replace(98, '有', inplace = True)\n",
    "data_2['其他，请注明.3'].replace(-1, '没有', inplace = True)\n",
    "data_2['其他，请注明.4'].replace(98, '有', inplace = True)\n",
    "data_2['其他，请注明.4'].replace(-1, '没有', inplace = True)\n",
    "data_2['其他，请注明.5'].replace(98, '有', inplace = True)\n",
    "data_2['其他，请注明.5'].replace(-1, '没有', inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "27941e3c",
   "metadata": {},
   "outputs": [],
   "source": [
    "#['网络信号差/没有信号', '显示有信号上不了网','上网过程中网络时断时续或时快时慢', '手机上网速度慢']这4个因素里面，-1表示没有，1，2，3，4表示有\n",
    "cols = ['网络信号差/没有信号', '显示有信号上不了网','上网过程中网络时断时续或时快时慢', '手机上网速度慢']\n",
    "j = 0\n",
    "for col in cols:\n",
    "        j += 1\n",
    "        data_2[col].replace(-1, '不在', inplace = True)\n",
    "        data_2[col].replace(j, '在', inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "879fb961",
   "metadata": {},
   "outputs": [],
   "source": [
    "#['看视频卡顿', '打游戏延时大','打开网页或APP图片慢', '下载速度慢', '手机支付较慢']中，-1表示没有，1，2，3，4，5表示有问题\n",
    "cols = ['看视频卡顿', '打游戏延时大','打开网页或APP图片慢', '下载速度慢', '手机支付较慢']\n",
    "j = 0\n",
    "for col in cols:\n",
    "        j += 1\n",
    "        data_2[col].replace(-1, '不在', inplace = True)\n",
    "        data_2[col].replace(j, '在', inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f7ce485e",
   "metadata": {},
   "outputs": [],
   "source": [
    "#['爱奇艺', '优酷', '腾讯视频','芒果TV', '搜狐视频', '抖音', '快手', '火山', '咪咕视频']中，-1表示没有，1，2，3，4，5，6，7，8，9表示有\n",
    "cols = ['爱奇艺', '优酷', '腾讯视频','芒果TV', '搜狐视频', '抖音', '快手', '火山', '咪咕视频']\n",
    "j = 0\n",
    "for col in cols:\n",
    "        j += 1\n",
    "        data_2[col].replace(-1, '不在', inplace = True)\n",
    "        data_2[col].replace(j, '在', inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a213968d",
   "metadata": {},
   "outputs": [],
   "source": [
    "#['和平精英','王者荣耀', '穿越火线', '梦幻西游', '龙之谷', '梦幻诛仙', '欢乐斗地主', '部落冲突', '炉石传说', '阴阳师']中，-1表示没有，1-10表示有\n",
    "cols = ['和平精英','王者荣耀', '穿越火线', '梦幻西游', '龙之谷', '梦幻诛仙', '欢乐斗地主', '部落冲突', '炉石传说', '阴阳师']\n",
    "j = 0\n",
    "for col in cols:\n",
    "        j += 1\n",
    "        data_2[col].replace(-1, '不在', inplace = True)\n",
    "        data_2[col].replace(j, '在', inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "555c4b2d",
   "metadata": {},
   "outputs": [],
   "source": [
    "#['微信', '手机QQ', '淘宝', '京东', '百度', '今日头条', '新浪微博','拼多多']中，-1表示没有，1-7表示有\n",
    "cols = ['微信', '手机QQ', '淘宝', '京东', '百度', '今日头条', '新浪微博','拼多多']\n",
    "j = 0\n",
    "for col in cols:\n",
    "        j += 1\n",
    "        data_2[col].replace(-1, '不在', inplace = True)\n",
    "        data_2[col].replace(j, '在', inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a95d5486",
   "metadata": {},
   "outputs": [],
   "source": [
    "#['全部都卡顿', '全部网页或APP都慢']中，-1表示否，99表示是\n",
    "data_2['全部都卡顿'].replace(99, '是', inplace = True)\n",
    "data_2['全部都卡顿'].replace(-1, '否', inplace = True)\n",
    "data_2['全部网页或APP都慢'].replace(99, '是', inplace = True)\n",
    "data_2['全部网页或APP都慢'].replace(-1, '否', inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "863bf080",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_2.to_excel('./data/data_2处理后的数据.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1288d4b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "#根据“手机上网整体满意度”进行编码\n",
    "import category_encoders as ce\n",
    "df_1 = data_2.copy()\n",
    "target = df_1[['手机上网整体满意度']]\n",
    "train = df_1.drop(['手机上网整体满意度', '网络覆盖与信号强度', '手机上网速度', '手机上网稳定性'], axis = 1)\n",
    "cbe_encoder = ce.cat_boost.CatBoostEncoder()\n",
    "cbe_encoder.fit(train, target)\n",
    "data = pd.DataFrame(cbe_encoder.transform(train))\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "544276df",
   "metadata": {},
   "outputs": [],
   "source": [
    "#互信息\n",
    "def mic(x, y):\n",
    "    m = MINE()\n",
    "    m.compute_score(x, y)\n",
    "    return (m.mic(), 0.5)\n",
    "index_4 = np.array(data.columns)\n",
    "lst = []\n",
    "for i in index_4:\n",
    "    l = mic(data[i], df_1['手机上网整体满意度'])[0]\n",
    "    lst.append(l)\n",
    "w = pd.Series(pd.Series(lst).values, index = index_4).sort_values(ascending = False)\n",
    "w[:60]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b207e58d",
   "metadata": {},
   "outputs": [],
   "source": [
    "#GBDT算法\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "X, y = data, df_1.iloc[:, 0]\n",
    "feat_labels = df_1.columns[4 : ]\n",
    "clf = GradientBoostingClassifier(n_estimators = 300, random_state = 10)\n",
    "clf.fit(X, y)\n",
    "var_importance = pd.Series(clf.feature_importances_, index = feat_labels).sort_values(ascending = False)\n",
    "var_importance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "71e405d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "index = np.array(var_importance.index)\n",
    "feature_importance = pd.DataFrame()\n",
    "feature_importance['feature_names'] = index\n",
    "feature_importance['feature_values'] = var_importance.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "68561ec0",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.rcParams['font.sans-serif'] = ['STSong']\n",
    "plt.rcParams['figure.dpi'] = 1000\n",
    "g = sns.catplot(data = feature_importance.iloc[: 20, :], x = 'feature_values', y = 'feature_names', kind = 'bar', height = 8, aspect = 1.875)\n",
    "plt.title('手机上网整体满意度', font={'family' : 'STSong', 'size' : 14})\n",
    "plt.ylabel('feature_names', font={'family':'STSong', 'size':14})\n",
    "plt.xlabel('feature_values', font={'family':'STSong', 'size':14})\n",
    "df = feature_importance.iloc[: 20, :]\n",
    "C = np.array(df['feature_values'])\n",
    "for a, b, c in zip(df['feature_values'], range(20), C):\n",
    "\tplt.text(a + 0.0001, b, '%.3f'%c, fontsize = 14)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b72e2cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "#网络覆盖与信号强度\n",
    "df_2 = data_2.copy()\n",
    "target = df_2[['网络覆盖与信号强度']]\n",
    "train = df_1.drop(['手机上网整体满意度', '网络覆盖与信号强度', '手机上网速度', '手机上网稳定性'], axis = 1)\n",
    "cbe_encoder = ce.cat_boost.CatBoostEncoder()\n",
    "cbe_encoder.fit(train, target)\n",
    "data2 = pd.DataFrame(cbe_encoder.transform(train))\n",
    "data2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39930603",
   "metadata": {},
   "outputs": [],
   "source": [
    "#互信息\n",
    "def mic(x, y):\n",
    "    m = MINE()\n",
    "    m.compute_score(x, y)\n",
    "    return (m.mic(), 0.5)\n",
    "index_4 = np.array(data2.columns)\n",
    "lst = []\n",
    "for i in index_4:\n",
    "    l = mic(data2[i], df_2['网络覆盖与信号强度'])[0]\n",
    "    lst.append(l)\n",
    "w = pd.Series(pd.Series(lst).values, index = index_4).sort_values(ascending = False)\n",
    "w[:60]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3bd850f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "#GBDT算法\n",
    "X, y = data2, df_2.iloc[:, 1]\n",
    "feat_labels = df_2.columns[4 : ]\n",
    "clf2 = GradientBoostingClassifier(n_estimators = 300, random_state = 10)\n",
    "clf2.fit(X, y)\n",
    "var_importance2 = pd.Series(clf2.feature_importances_, index = feat_labels).sort_values(ascending = False)\n",
    "var_importance2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "224d5032",
   "metadata": {},
   "outputs": [],
   "source": [
    "index2 = np.array(var_importance2.index)\n",
    "feature_importance2 = pd.DataFrame()\n",
    "feature_importance2['feature_names'] = index2\n",
    "feature_importance2['feature_values'] = var_importance2.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a9c028e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.rcParams['font.sans-serif'] = ['STSong']\n",
    "plt.rcParams['figure.dpi'] = 1000\n",
    "g = sns.catplot(data = feature_importance2.iloc[: 20, :], x = 'feature_values', y = 'feature_names', kind = 'bar', height = 8, aspect = 1.875)\n",
    "plt.title('网络覆盖与信号强度', font={'family' : 'STSong', 'size' : 14})\n",
    "plt.ylabel('feature_names', font={'family':'STSong', 'size':14})\n",
    "plt.xlabel('feature_values', font={'family':'STSong', 'size':14})\n",
    "df = feature_importance2.iloc[: 20, :]\n",
    "C = np.array(df['feature_values'])\n",
    "for a, b, c in zip(df['feature_values'], range(20), C):\n",
    "\tplt.text(a + 0.0001, b, '%.3f'%c, fontsize = 14)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b42fee4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "#手机上网速度\n",
    "df_3 = data_2.copy()\n",
    "target = df_3[['手机上网速度']]\n",
    "train = df_1.drop(['手机上网整体满意度', '网络覆盖与信号强度', '手机上网速度', '手机上网稳定性'], axis = 1)\n",
    "cbe_encoder = ce.cat_boost.CatBoostEncoder()\n",
    "cbe_encoder.fit(train, target)\n",
    "data3 = pd.DataFrame(cbe_encoder.transform(train))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "16638391",
   "metadata": {},
   "outputs": [],
   "source": [
    "#互信息\n",
    "def mic(x, y):\n",
    "    m = MINE()\n",
    "    m.compute_score(x, y)\n",
    "    return (m.mic(), 0.5)\n",
    "index_4 = np.array(data3.columns)\n",
    "lst = []\n",
    "for i in index_4:\n",
    "    l = mic(data3[i], df_3['手机上网速度'])[0]\n",
    "    lst.append(l)\n",
    "w = pd.Series(pd.Series(lst).values, index = index_4).sort_values(ascending = False)\n",
    "w[:60]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ab306622",
   "metadata": {},
   "outputs": [],
   "source": [
    "#GBDT算法\n",
    "X, y = data3, df_3.iloc[:, 2]\n",
    "feat_labels = df_3.columns[4 : ]\n",
    "clf3 = GradientBoostingClassifier(n_estimators = 300, random_state = 10)\n",
    "clf3.fit(X, y)\n",
    "var_importance3 = pd.Series(clf3.feature_importances_, index = feat_labels).sort_values(ascending = False)\n",
    "var_importance3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6bdcdd4f",
   "metadata": {},
   "outputs": [],
   "source": [
    "index3 = np.array(var_importance3.index)\n",
    "feature_importance3 = pd.DataFrame()\n",
    "feature_importance3['feature_names'] = index3\n",
    "feature_importance3['feature_values'] = var_importance3.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8eea6e91",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.rcParams['font.sans-serif'] = ['STSong']\n",
    "plt.rcParams['figure.dpi'] = 1000\n",
    "g = sns.catplot(data = feature_importance3.iloc[: 20, :], x = 'feature_values', y = 'feature_names', kind = 'bar', height = 8, aspect = 1.875)\n",
    "plt.title('手机上网速度', font={'family' : 'STSong', 'size' : 14})\n",
    "plt.ylabel('feature_names', font={'family':'STSong', 'size':14})\n",
    "plt.xlabel('feature_values', font={'family':'STSong', 'size':14})\n",
    "df = feature_importance3.iloc[: 20, :]\n",
    "C = np.array(df['feature_values'])\n",
    "for a, b, c in zip(df['feature_values'], range(20), C):\n",
    "\tplt.text(a + 0.0001, b, '%.3f'%c, fontsize = 14)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f307b0e",
   "metadata": {},
   "outputs": [],
   "source": [
    "#手机上网稳定性\n",
    "df_4 = data_2.copy()\n",
    "target = df_4[['手机上网稳定性']]\n",
    "train = df_1.drop(['手机上网整体满意度', '网络覆盖与信号强度', '手机上网速度', '手机上网稳定性'], axis = 1)\n",
    "cbe_encoder = ce.cat_boost.CatBoostEncoder()\n",
    "cbe_encoder.fit(train, target)\n",
    "data4 = pd.DataFrame(cbe_encoder.transform(train))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1a8f198",
   "metadata": {},
   "outputs": [],
   "source": [
    "#互信息\n",
    "def entropy(data):\n",
    "    a = pd.value_counts(data) / len(data)\n",
    "    return np.sum(np.log2(a) * a * (-1)) / (np.log2(data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d5c971a",
   "metadata": {},
   "outputs": [],
   "source": [
    "columns = np.array(df_4.columns)[4 : ]\n",
    "entropys = []\n",
    "for column in columns:\n",
    "    entro = entropy(data4[column])\n",
    "    entropys.append(entro)\n",
    "sums = 0\n",
    "for i in entropys:\n",
    "    sums += i\n",
    "weights = []\n",
    "for i in range(len(entropys)):\n",
    "    weight = entropys[i] / sums\n",
    "    weights.append(weight)\n",
    "w = pd.Series(pd.Series(weights).values, index = columns).sort_values(ascending = False)\n",
    "w[:60]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11f6f189",
   "metadata": {},
   "outputs": [],
   "source": [
    "def mic(x, y):\n",
    "    m = MINE()\n",
    "    m.compute_score(x, y)\n",
    "    return (m.mic(), 0.5)\n",
    " \n",
    "index_4 = np.array(data4.columns)\n",
    "lst = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9b6b598f",
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in index_4:\n",
    "    l = mic(data4[i], df_4['手机上网稳定性'])[0]\n",
    "    lst.append(l)\n",
    "w = pd.Series(pd.Series(lst).values, index = index_4).sort_values(ascending = False)\n",
    "w[:60]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cdaebf6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "#GBDT算法\n",
    "X, y = data4, df_4.iloc[:, 2]\n",
    "feat_labels = df_4.columns[4 : ]\n",
    "clf4 = GradientBoostingClassifier(n_estimators = 300, random_state = 10)\n",
    "clf4.fit(X, y)\n",
    "var_importance4 = pd.Series(clf4.feature_importances_, index = feat_labels).sort_values(ascending = False)\n",
    "var_importance4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "24739f2c",
   "metadata": {},
   "outputs": [],
   "source": [
    "index4 = np.array(var_importance4.index)\n",
    "feature_importance4 = pd.DataFrame()\n",
    "feature_importance4['feature_names'] = index4\n",
    "feature_importance4['feature_values'] = var_importance4.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "50c235f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.rcParams['font.sans-serif'] = ['STSong']\n",
    "plt.rcParams['figure.dpi'] = 1000\n",
    "g = sns.catplot(data = feature_importance4.iloc[: 20, :], x = 'feature_values', y = 'feature_names', kind = 'bar', height = 8, aspect = 1.875)\n",
    "plt.title('手机上网稳定性', font={'family' : 'STSong', 'size' : 14})\n",
    "plt.ylabel('feature_names', font={'family':'STSong', 'size':14})\n",
    "plt.xlabel('feature_values', font={'family':'STSong', 'size':14})\n",
    "df = feature_importance4.iloc[: 20, :]\n",
    "C = np.array(df['feature_values'])\n",
    "for a, b, c in zip(df['feature_values'], range(20), C):\n",
    "\tplt.text(a + 0.0001, b, '%.3f'%c, fontsize = 14)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "98603e02",
   "metadata": {},
   "outputs": [],
   "source": [
    "#相关系数图\n",
    "data_2.iloc[:,1:5].corr()\n",
    "# Plot\n",
    "def plot_corr(y_X):\n",
    "    plt.figure(figsize=(18,3.4), dpi= 1000)\n",
    "    titles = ['pearson','kendall','spearman']\n",
    "    for i,title in enumerate(titles):\n",
    "        plt.subplot(1, 3, i+1)\n",
    "        sns.heatmap(y_X.corr(title),  # 相关系数矩阵\n",
    "                    xticklabels=y_X.corr().columns, # 横坐标标签\n",
    "                    yticklabels=y_X.corr().columns, # 横坐标标签\n",
    "                    cmap='copper', \n",
    "                    center=0, \n",
    "                    square=True,\n",
    "                    annot=True\n",
    "                    ) \n",
    "        \n",
    "        plt.title(label=title+'相关系数', fontsize=12)\n",
    "        plt.xticks(fontsize=10,\n",
    "                   rotation=45,# 旋转\n",
    "                   horizontalalignment='right') # 刻度的相对位置\n",
    "        plt.yticks(fontsize=10,rotation=0,)\n",
    "    plt.show()\n",
    "plot_corr(data_2.iloc[ :,1:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17b36863",
   "metadata": {},
   "outputs": [],
   "source": [
    "#热力图\n",
    "data2 = data.loc[:,index[:20]]\n",
    "plt.figure(figsize = (15, 15), dpi = 1000)\n",
    "sns.heatmap(data2.corr(), cmap='YlGnBu')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4928117c",
   "metadata": {},
   "outputs": [],
   "source": [
    "#量化分析图\n",
    "#网络信号差/没有信号\n",
    "data_2['网络信号差/没有信号'].replace('不在', '否', inplace = True)\n",
    "data_2['网络信号差/没有信号'].replace('在', '是', inplace = True)\n",
    "data_2['网络信号差/没有信号']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "be2efc01",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize = (15, 15), dpi = 1000)\n",
    "plt.subplot(4, 2, 1)\n",
    "sns.countplot(data = data_2, x = '手机上网整体满意度', hue = '网络信号差/没有信号')\n",
    "df1 = data_2.groupby(['手机上网整体满意度', '网络信号差/没有信号']).size().reset_index(name='Count')\n",
    "no1 = (df1.loc[df1['网络信号差/没有信号'] == '否']).loc[:,'Count']\n",
    "yes1 = (df1.loc[df1['网络信号差/没有信号'] == '是']).loc[:,'Count']\n",
    "x1 = list(range(len(yes1)))\n",
    "for a,b in zip(x1, yes1): ##控制标签位置\n",
    "    plt.text(a+0.18, b+0.1, '%.0f'%b, ha = 'center', va = 'bottom', fontsize = 8)\n",
    "for a,b in zip(x1, no1):\n",
    "    plt.text(a-0.2, b+0.1, '%.0f'%b, ha = 'center', va = 'bottom', fontsize = 8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7525449",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.subplot(4, 2, 2)\n",
    "sns.countplot(data = data_2, x = '网络覆盖与信号强度', hue = '网络信号差/没有信号')\n",
    "df2 = data_2.groupby(['网络覆盖与信号强度', '网络信号差/没有信号']).size().reset_index(name='Count')\n",
    "no2 = (df2.loc[df2['网络信号差/没有信号'] == '否']).loc[:,'Count']\n",
    "yes2 = (df2.loc[df2['网络信号差/没有信号'] == '是']).loc[:,'Count']\n",
    "x2 = list(range(len(yes2)))\n",
    "for a,b in zip(x2, yes2): ##控制标签位置\n",
    "    plt.text(a+0.18,b+0.1,'%.0f'%b,ha = 'center',va = 'bottom', fontsize = 8)\n",
    "for a,b in zip(x2, no2):\n",
    "    plt.text(a-0.2, b+0.1, '%.0f'%b, ha = 'center', va = 'bottom', fontsize = 8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25dc13cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.subplot(4, 2, 3)\n",
    "sns.countplot(data = data_2, x = '手机上网速度', hue = '网络信号差/没有信号')\n",
    "df3 = data_2.groupby(['手机上网速度', '网络信号差/没有信号']).size().reset_index(name='Count')\n",
    "no3 = (df3.loc[df3['网络信号差/没有信号'] == '否']).loc[:,'Count']\n",
    "yes3 = (df3.loc[df3['网络信号差/没有信号'] == '是']).loc[:,'Count']\n",
    "x3 = list(range(len(yes3)))\n",
    "for a,b in zip(x3, yes3): ##控制标签位置\n",
    "    plt.text(a+0.18, b+0.1, '%.0f'%b, ha = 'center', va = 'bottom', fontsize = 8)\n",
    "for a,b in zip(x3, no3):\n",
    "    plt.text(a-0.2, b+0.1, '%.0f'%b, ha = 'center', va = 'bottom', fontsize = 8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56acb751",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.subplot(4, 2, 4)\n",
    "sns.countplot(data = data_2, x = '手机上网稳定性', hue = '网络信号差/没有信号')\n",
    "df4 = data_2.groupby(['手机上网稳定性', '网络信号差/没有信号']).size().reset_index(name='Count')\n",
    "no4 = (df4.loc[df4['网络信号差/没有信号'] == '否']).loc[:,'Count']\n",
    "yes4 = (df4.loc[df4['网络信号差/没有信号'] == '是']).loc[:,'Count']\n",
    "x4 = list(range(len(yes4)))\n",
    "for a,b in zip(x4, yes4): ##控制标签位置\n",
    "    plt.text(a+0.18, b+0.1, '%.0f'%b, ha = 'center', va = 'bottom', fontsize = 8)\n",
    "for a,b in zip(x4, no4):\n",
    "    plt.text(a-0.2, b+0.1, '%.0f'%b, ha = 'center', va = 'bottom', fontsize = 8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e80a8982",
   "metadata": {},
   "outputs": [],
   "source": [
    "#重定向次数\n",
    "max1 = (data_2.loc[data_2.groupby('手机上网整体满意度')['重定向次数'].idxmax(), :].reset_index())['重定向次数']\n",
    "min1 = (data_2.loc[data_2.groupby('手机上网整体满意度')['重定向次数'].idxmin(), :].reset_index())['重定向次数']\n",
    "score = pd.DataFrame(np.arange(1, 11))\n",
    "df1 = pd.concat([score,min1,max1], axis = 1)\n",
    "df1.columns = ['手机上网整体满意度','重定向次数_min','重定向次数_max']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "549c92b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "max2 = (data_2.loc[data_2.groupby('网络覆盖与信号强度')['重定向次数'].idxmax(), :].reset_index())['重定向次数']\n",
    "min2 = (data_2.loc[data_2.groupby('网络覆盖与信号强度')['重定向次数'].idxmin(), :].reset_index())['重定向次数']\n",
    "score = pd.DataFrame(np.arange(1, 11))\n",
    "df2 = pd.concat([score,min2,max2], axis = 1)\n",
    "df2.columns = ['网络覆盖与信号强度','重定向次数_min','重定向次数_max']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae68d09a",
   "metadata": {},
   "outputs": [],
   "source": [
    "max3 = (data_2.loc[data_2.groupby('手机上网速度')['重定向次数'].idxmax(), :].reset_index())['重定向次数']\n",
    "min3 = (data_2.loc[data_2.groupby('手机上网速度')['重定向次数'].idxmin(), :].reset_index())['重定向次数']\n",
    "score = pd.DataFrame(np.arange(1, 11))\n",
    "df3 = pd.concat([score,min3,max3], axis = 1)\n",
    "df3.columns = ['手机上网速度','重定向次数_min','重定向次数_max']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b19039d",
   "metadata": {},
   "outputs": [],
   "source": [
    "max4 = (data_2.loc[data_2.groupby('手机上网稳定性')['重定向次数'].idxmax(), :].reset_index())['重定向次数']\n",
    "min4 = (data_2.loc[data_2.groupby('手机上网稳定性')['重定向次数'].idxmin(), :].reset_index())['重定向次数']\n",
    "score = pd.DataFrame(np.arange(1, 11))\n",
    "df4 = pd.concat([score,min4,max4], axis = 1)\n",
    "df4.columns = ['手机上网稳定性','重定向次数_min','重定向次数_max']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8df2fbb9",
   "metadata": {},
   "outputs": [],
   "source": [
    "newdf = pd.concat([df1['手机上网整体满意度'], df1['重定向次数_max'], df2['重定向次数_max'], df3['重定向次数_max'], df4['重定向次数_max']], axis=1)\n",
    "newdf.columns = ['打分', '手机上网整体满意度', '网络覆盖与信号强度', '手机上网速度', '手机上网稳定性']\n",
    "newdf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6692026",
   "metadata": {},
   "outputs": [],
   "source": [
    "#上网过程中网络时断时续或时快时慢\n",
    "data_2['上网过程中网络时断时续或时快时慢'].replace('不在', '否', inplace = True)\n",
    "data_2['上网过程中网络时断时续或时快时慢'].replace('在', '是', inplace = True)\n",
    "data_2['上网过程中网络时断时续或时快时慢']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ef5c4e09",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize = (15, 15), dpi = 1000)\n",
    "plt.subplot(4, 2, 1)\n",
    "sns.countplot(data = data_2, x = '手机上网整体满意度', hue = '上网过程中网络时断时续或时快时慢')\n",
    "df1 = data_2.groupby(['手机上网整体满意度', '上网过程中网络时断时续或时快时慢']).size().reset_index(name='Count')\n",
    "no1 = (df1.loc[df1['上网过程中网络时断时续或时快时慢'] == '否']).loc[:,'Count']\n",
    "yes1 = (df1.loc[df1['上网过程中网络时断时续或时快时慢'] == '是']).loc[:,'Count']\n",
    "x1 = list(range(len(yes1)))\n",
    "for a,b in zip(x1, yes1): ##控制标签位置\n",
    "    plt.text(a+0.18, b+0.1, '%.0f'%b, ha = 'center', va = 'bottom', fontsize = 8)\n",
    "for a,b in zip(x1, no1):\n",
    "    plt.text(a-0.2, b+0.1, '%.0f'%b, ha = 'center', va = 'bottom', fontsize = 8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cfd89e4e",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.subplot(4, 2, 2)\n",
    "sns.countplot(data = data_2, x = '网络覆盖与信号强度', hue = '上网过程中网络时断时续或时快时慢')\n",
    "df2 = data_2.groupby(['网络覆盖与信号强度', '上网过程中网络时断时续或时快时慢']).size().reset_index(name='Count')\n",
    "no2 = (df2.loc[df2['上网过程中网络时断时续或时快时慢'] == '否']).loc[:,'Count']\n",
    "yes2 = (df2.loc[df2['上网过程中网络时断时续或时快时慢'] == '是']).loc[:,'Count']\n",
    "x2 = list(range(len(yes2)))\n",
    "for a,b in zip(x2, yes2): ##控制标签位置\n",
    "    plt.text(a+0.18,b+0.1,'%.0f'%b,ha = 'center',va = 'bottom', fontsize = 8)\n",
    "for a,b in zip(x2, no2):\n",
    "    plt.text(a-0.2, b+0.1, '%.0f'%b, ha = 'center', va = 'bottom', fontsize = 8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "34960650",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.subplot(4, 2, 3)\n",
    "sns.countplot(data = data_2, x = '手机上网速度', hue = '上网过程中网络时断时续或时快时慢')\n",
    "df3 = data_2.groupby(['手机上网速度', '上网过程中网络时断时续或时快时慢']).size().reset_index(name='Count')\n",
    "no3 = (df3.loc[df3['上网过程中网络时断时续或时快时慢'] == '否']).loc[:,'Count']\n",
    "yes3 = (df3.loc[df3['上网过程中网络时断时续或时快时慢'] == '是']).loc[:,'Count']\n",
    "x3 = list(range(len(yes3)))\n",
    "for a,b in zip(x3, yes3): ##控制标签位置\n",
    "    plt.text(a+0.18, b+0.1, '%.0f'%b, ha = 'center', va = 'bottom', fontsize = 8)\n",
    "for a,b in zip(x3, no3):\n",
    "    plt.text(a-0.2, b+0.1, '%.0f'%b, ha = 'center', va = 'bottom', fontsize = 8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ab628cfa",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.subplot(4, 2, 4)\n",
    "sns.countplot(data = data_2, x = '手机上网稳定性', hue = '上网过程中网络时断时续或时快时慢')\n",
    "df4 = data_2.groupby(['手机上网稳定性', '上网过程中网络时断时续或时快时慢']).size().reset_index(name='Count')\n",
    "no4 = (df4.loc[df4['上网过程中网络时断时续或时快时慢'] == '否']).loc[:,'Count']\n",
    "yes4 = (df4.loc[df4['上网过程中网络时断时续或时快时慢'] == '是']).loc[:,'Count']\n",
    "x4 = list(range(len(yes4)))\n",
    "for a,b in zip(x4, yes4): ##控制标签位置\n",
    "    plt.text(a+0.18, b+0.1, '%.0f'%b, ha = 'center', va = 'bottom', fontsize = 8)\n",
    "for a,b in zip(x4, no4):\n",
    "    plt.text(a-0.2, b+0.1, '%.0f'%b, ha = 'center', va = 'bottom', fontsize = 8)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
